import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, roc_curve, roc_auc_score
Objective: The objective of this project is to accurately predict the chances of developing lung cancer based on the genetic risk target variable.
Background: Cancer is a disease in which cells in the body grow out of control. When cancer starts in the lungs, it is called lung cancer.
Lung cancer begins in the lungs and may spread to lymph nodes or other organs in the body, such as the brain. Cancer from other organs also may spread to the lungs. When cancer cells spread from one organ to another, they are called metastases.
Lung cancers usually are grouped into two main types called small cell and non-small cell (including adenocarcinoma and squamous cell carcinoma). These types of lung cancer grow differently and are treated differently. Non-small cell lung cancer is more common than small cell lung cancer.
# Load the lung cancer patient dataset from a local CSV file and preview the first rows.
csv_path = '/Users/abdihussein/Downloads/cancer patient data sets.csv'
dataset = pd.read_csv(csv_path, sep=',')
dataset.head()
| index | Patient Id | Age | Gender | Air Pollution | Alcohol use | Dust Allergy | OccuPational Hazards | Genetic Risk | chronic Lung Disease | ... | Fatigue | Weight Loss | Shortness of Breath | Wheezing | Swallowing Difficulty | Clubbing of Finger Nails | Frequent Cold | Dry Cough | Snoring | Level | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | P1 | 33 | 1 | 2 | 4 | 5 | 4 | 3 | 2 | ... | 3 | 4 | 2 | 2 | 3 | 1 | 2 | 3 | 4 | Low |
| 1 | 1 | P10 | 17 | 1 | 3 | 1 | 5 | 3 | 4 | 2 | ... | 1 | 3 | 7 | 8 | 6 | 2 | 1 | 7 | 2 | Medium |
| 2 | 2 | P100 | 35 | 1 | 4 | 5 | 6 | 5 | 5 | 4 | ... | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 | High |
| 3 | 3 | P1000 | 37 | 1 | 7 | 7 | 7 | 7 | 6 | 7 | ... | 4 | 2 | 3 | 1 | 4 | 5 | 6 | 7 | 5 | High |
| 4 | 4 | P101 | 46 | 1 | 6 | 8 | 7 | 7 | 7 | 6 | ... | 3 | 2 | 4 | 1 | 4 | 2 | 4 | 2 | 3 | High |
5 rows × 26 columns
# Data-quality check: count missing values per column (all zero for this dataset).
dataset.isnull().sum()
index 0 Patient Id 0 Age 0 Gender 0 Air Pollution 0 Alcohol use 0 Dust Allergy 0 OccuPational Hazards 0 Genetic Risk 0 chronic Lung Disease 0 Balanced Diet 0 Obesity 0 Smoking 0 Passive Smoker 0 Chest Pain 0 Coughing of Blood 0 Fatigue 0 Weight Loss 0 Shortness of Breath 0 Wheezing 0 Swallowing Difficulty 0 Clubbing of Finger Nails 0 Frequent Cold 0 Dry Cough 0 Snoring 0 Level 0 dtype: int64
# Drop the first two columns ('index' and 'Patient Id') — row identifiers with no predictive value.
dataset = dataset.iloc[:, 2:]
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1000 non-null int64 1 Gender 1000 non-null int64 2 Air Pollution 1000 non-null int64 3 Alcohol use 1000 non-null int64 4 Dust Allergy 1000 non-null int64 5 OccuPational Hazards 1000 non-null int64 6 Genetic Risk 1000 non-null int64 7 chronic Lung Disease 1000 non-null int64 8 Balanced Diet 1000 non-null int64 9 Obesity 1000 non-null int64 10 Smoking 1000 non-null int64 11 Passive Smoker 1000 non-null int64 12 Chest Pain 1000 non-null int64 13 Coughing of Blood 1000 non-null int64 14 Fatigue 1000 non-null int64 15 Weight Loss 1000 non-null int64 16 Shortness of Breath 1000 non-null int64 17 Wheezing 1000 non-null int64 18 Swallowing Difficulty 1000 non-null int64 19 Clubbing of Finger Nails 1000 non-null int64 20 Frequent Cold 1000 non-null int64 21 Dry Cough 1000 non-null int64 22 Snoring 1000 non-null int64 23 Level 1000 non-null object dtypes: int64(23), object(1) memory usage: 187.6+ KB
# Histogram of every numeric column to eyeball each feature's distribution.
a=dataset.hist(figsize=(15,15))
# Interactive distribution of patient ages, split by risk level.
fig = px.histogram(dataset, x='Age', color='Level', nbins=10, title='Distribution of ages', text_auto=True)
fig.show()
# Box plot of age per risk level to spot outliers.
fig = px.box(dataset, x='Age', y='Level')
fig.show()
# Bar chart of how many patients fall into each risk level.
plt.figure(figsize=(6, 4))
# Pass the column by name: a positional Series argument to countplot is
# deprecated since seaborn 0.12 and raises a TypeError in newer releases.
ax = sns.countplot(x='Level', data=dataset)
for bars in ax.containers:
    ax.bar_label(bars)
plt.title("Count of Levels", fontsize=15);
def graph(x, y, title):
    """Show a grouped histogram of column *x* (weighted by *y*) colored by risk
    Level, followed by a box plot of *x* against Level.

    Relies on the module-level `dataset` DataFrame.
    NOTE(review): the name `graph` is later reassigned to a heatmap Axes
    object, which shadows this function for the rest of the script.
    """
    fig = px.histogram(dataset, x = x, y = y, color = 'Level', barmode = 'group',
    title = title, text_auto = True)
    fig.show()
    fig = px.box(dataset, x = x, y = 'Level')
    fig.show()
#graph('Alcohol use', 'Gender', 'Distribution of Alcohol use')
# Compare Genetic Risk scores between genders and across risk levels.
graph('Genetic Risk', 'Gender', 'Distribution of Genetic Risk')
# Count of patients per risk level, split by gender.
fig = px.histogram(dataset, x = 'Level', y = 'Gender', color = 'Gender', barmode = 'group',
title = 'Distribution of Level', text_auto = True)
fig.show()
As the value of a risk-factor column increases, the risk level tends to increase as well, apart from a few examples with outliers. The Medium category is perhaps the least objective, since it sits between the High and Low values and is not as clearly defined a category.
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
dataset.describe()
| Age | Gender | Air Pollution | Alcohol use | Dust Allergy | OccuPational Hazards | Genetic Risk | chronic Lung Disease | Balanced Diet | Obesity | ... | Coughing of Blood | Fatigue | Weight Loss | Shortness of Breath | Wheezing | Swallowing Difficulty | Clubbing of Finger Nails | Frequent Cold | Dry Cough | Snoring | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.0000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | ... | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 37.174000 | 1.402000 | 3.8400 | 4.563000 | 5.165000 | 4.840000 | 4.580000 | 4.380000 | 4.491000 | 4.465000 | ... | 4.859000 | 3.856000 | 3.855000 | 4.240000 | 3.777000 | 3.746000 | 3.923000 | 3.536000 | 3.853000 | 2.926000 |
| std | 12.005493 | 0.490547 | 2.0304 | 2.620477 | 1.980833 | 2.107805 | 2.126999 | 1.848518 | 2.135528 | 2.124921 | ... | 2.427965 | 2.244616 | 2.206546 | 2.285087 | 2.041921 | 2.270383 | 2.388048 | 1.832502 | 2.039007 | 1.474686 |
| min | 14.000000 | 1.000000 | 1.0000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 27.750000 | 1.000000 | 2.0000 | 2.000000 | 4.000000 | 3.000000 | 2.000000 | 3.000000 | 2.000000 | 3.000000 | ... | 3.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 |
| 50% | 36.000000 | 1.000000 | 3.0000 | 5.000000 | 6.000000 | 5.000000 | 5.000000 | 4.000000 | 4.000000 | 4.000000 | ... | 4.000000 | 3.000000 | 3.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 3.000000 | 4.000000 | 3.000000 |
| 75% | 45.000000 | 2.000000 | 6.0000 | 7.000000 | 7.000000 | 7.000000 | 7.000000 | 6.000000 | 7.000000 | 7.000000 | ... | 7.000000 | 5.000000 | 6.000000 | 6.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 6.000000 | 4.000000 |
| max | 73.000000 | 2.000000 | 8.0000 | 8.000000 | 8.000000 | 8.000000 | 7.000000 | 7.000000 | 7.000000 | 7.000000 | ... | 9.000000 | 9.000000 | 8.000000 | 9.000000 | 8.000000 | 8.000000 | 9.000000 | 7.000000 | 7.000000 | 7.000000 |
8 rows × 23 columns
# Display the cleaned DataFrame (1000 rows x 24 columns).
dataset
| Age | Gender | Air Pollution | Alcohol use | Dust Allergy | OccuPational Hazards | Genetic Risk | chronic Lung Disease | Balanced Diet | Obesity | ... | Fatigue | Weight Loss | Shortness of Breath | Wheezing | Swallowing Difficulty | Clubbing of Finger Nails | Frequent Cold | Dry Cough | Snoring | Level | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 33 | 1 | 2 | 4 | 5 | 4 | 3 | 2 | 2 | 4 | ... | 3 | 4 | 2 | 2 | 3 | 1 | 2 | 3 | 4 | Low |
| 1 | 17 | 1 | 3 | 1 | 5 | 3 | 4 | 2 | 2 | 2 | ... | 1 | 3 | 7 | 8 | 6 | 2 | 1 | 7 | 2 | Medium |
| 2 | 35 | 1 | 4 | 5 | 6 | 5 | 5 | 4 | 6 | 7 | ... | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 | High |
| 3 | 37 | 1 | 7 | 7 | 7 | 7 | 6 | 7 | 7 | 7 | ... | 4 | 2 | 3 | 1 | 4 | 5 | 6 | 7 | 5 | High |
| 4 | 46 | 1 | 6 | 8 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 3 | 2 | 4 | 1 | 4 | 2 | 4 | 2 | 3 | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 44 | 1 | 6 | 7 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 5 | 3 | 2 | 7 | 8 | 2 | 4 | 5 | 3 | High |
| 996 | 37 | 2 | 6 | 8 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 9 | 6 | 5 | 7 | 2 | 4 | 3 | 1 | 4 | High |
| 997 | 25 | 2 | 4 | 5 | 6 | 5 | 5 | 4 | 6 | 7 | ... | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 | High |
| 998 | 18 | 2 | 6 | 8 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 3 | 2 | 4 | 1 | 4 | 2 | 4 | 2 | 3 | High |
| 999 | 47 | 1 | 6 | 5 | 6 | 5 | 5 | 4 | 6 | 7 | ... | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 | High |
1000 rows × 24 columns
# Encode the target ordinally: Low -> 1, Medium -> 2, High -> 3.
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection — that is chained assignment, deprecated in recent pandas.
dataset['Level'] = dataset['Level'].replace(to_replace=["Low", "Medium", "High"], value=[1, 2, 3])

# Standardize every column to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so the test fold leaks into the feature scaling —
# refit on X_train only if unbiased evaluation is required.
standard = StandardScaler()
dataset_standard = standard.fit_transform(dataset)
dataset_standard_2 = pd.DataFrame(data=dataset_standard, columns=dataset.keys())

# Correlation heatmap of the standardized features. Use a dedicated name:
# the original assigned to `graph`, shadowing the plotting helper function
# defined earlier in the script. (Title typo 'Matriz' also fixed.)
correlation = dataset_standard_2.corr()
heatmap_ax = sns.heatmap(correlation, annot=True, fmt='.2f')
heatmap_ax.figure.set_size_inches(24, 16)
heatmap_ax.set_title('Correlation Matrix', fontsize=30)
Text(0.5, 1.0, 'Correlation Matriz')
# Features: the 23 standardized predictor columns.
X = dataset_standard_2.iloc[:, :23].values
# Target: the integer-encoded 'Level' column, taken from the unscaled frame.
y = dataset.iloc[:, 23].values
# Hold out 25% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)
# --- Bernoulli Naive Bayes ---
# Store the confusion matrix under a distinct name: the original assigned to
# `confusion_matrix`, shadowing the sklearn function (which is why the import
# had to be repeated before every subsequent model section).
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb_predict = bnb.predict(X_test)
accuracy_bnb = accuracy_score(y_test, bnb_predict)
cm_bnb = confusion_matrix(y_test, bnb_predict)
plt.figure(figsize=(6, 6))
sns.set(font_scale=1)
sns.heatmap(cm_bnb, annot=True, fmt='.1f').set(xlabel='Prediction', ylabel='Real')
print('Accuracy: {:.2f}'.format(accuracy_bnb))
Accuracy: 0.82
# --- Linear Support Vector Classifier ---
# Re-import guards against an earlier cell having shadowed the name with its
# confusion-matrix array; the matrix itself now gets a distinct name.
from sklearn.metrics import confusion_matrix
svc = LinearSVC()
svc.fit(X_train, y_train)
svc_predict = svc.predict(X_test)
accuracy_svc = accuracy_score(y_test, svc_predict)
cm_svc = confusion_matrix(y_test, svc_predict)
plt.figure(figsize=(6, 6))
sns.set(font_scale=1)
sns.heatmap(cm_svc, annot=True, fmt='.1f').set(xlabel='Prediction', ylabel='Real')
print('Accuracy: {:.2f}'.format(accuracy_svc))
Accuracy: 1.00
# --- Decision Tree Classifier ---
# Re-import guards against an earlier cell having shadowed the name with its
# confusion-matrix array; the matrix itself now gets a distinct name.
from sklearn.metrics import confusion_matrix
dtc = DecisionTreeClassifier(criterion='entropy', random_state=1234)
dtc.fit(X_train, y_train)
dtc_predict = dtc.predict(X_test)
accuracy_dtc = accuracy_score(y_test, dtc_predict)
cm_dtc = confusion_matrix(y_test, dtc_predict)
plt.figure(figsize=(6, 6))
sns.set(font_scale=1)
sns.heatmap(cm_dtc, annot=True, fmt='.1f').set(xlabel='Prediction', ylabel='Real')
print('Accuracy: {:.2f}'.format(accuracy_dtc))
Accuracy: 1.00
# --- Random Forest Classifier ---
# Re-import guards against an earlier cell having shadowed the name with its
# confusion-matrix array; the matrix itself now gets a distinct name.
# random_state added for reproducibility, consistent with the decision tree
# and the train/test split.
from sklearn.metrics import confusion_matrix
rfc = RandomForestClassifier(n_estimators=100, random_state=1234)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)
accuracy_rfc = accuracy_score(y_test, rfc_predict)
cm_rfc = confusion_matrix(y_test, rfc_predict)
plt.figure(figsize=(6, 6))
sns.set(font_scale=1)
sns.heatmap(cm_rfc, annot=True, fmt='.1f').set(xlabel='Prediction', ylabel='Real')
print('Accuracy: {:.2f}'.format(accuracy_rfc))
Accuracy: 1.00
# Train the K-Nearest Neighbors model, which was imported but never fitted
# in the original script — `knn_predict` would otherwise raise a NameError.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_predict = knn.predict(X_test)

# One-vs-rest binarization for per-class ROC analysis. Fit the binarizer once
# on the true labels, then reuse transform() for every prediction vector so
# all arrays share the same column order (re-fitting on each prediction could
# silently reorder the columns if a model never predicts one of the classes).
lb = LabelBinarizer()
y_test = lb.fit_transform(y_test)
bnb_predict = lb.transform(bnb_predict)
svc_predict = lb.transform(svc_predict)
knn_predict = lb.transform(knn_predict)
dtc_predict = lb.transform(dtc_predict)
rfc_predict = lb.transform(rfc_predict)

def _roc_per_class(y_true_bin, y_pred_bin):
    """Return per-class (auc, fpr, tpr, thresholds) lists for binarized labels.

    Each column of the binarized arrays is treated as an independent binary
    one-vs-rest problem, matching the three risk levels Low/Medium/High.
    """
    aucs, fprs, tprs, thresholds = [], [], [], []
    for i in range(y_true_bin.shape[1]):
        aucs.append(roc_auc_score(y_true_bin[:, i], y_pred_bin[:, i]))
        fpr, tpr, th = roc_curve(y_true_bin[:, i], y_pred_bin[:, i])
        fprs.append(fpr)
        tprs.append(tpr)
        thresholds.append(th)
    return aucs, fprs, tprs, thresholds

bnb_auc, bnb_fpr, bnb_tpr, bnb_thresholds = _roc_per_class(y_test, bnb_predict)
svc_auc, svc_fpr, svc_tpr, svc_thresholds = _roc_per_class(y_test, svc_predict)
knn_auc, knn_fpr, knn_tpr, knn_thresholds = _roc_per_class(y_test, knn_predict)
dtc_auc, dtc_fpr, dtc_tpr, dtc_thresholds = _roc_per_class(y_test, dtc_predict)
rfc_auc, rfc_fpr, rfc_tpr, rfc_thresholds = _roc_per_class(y_test, rfc_predict)
# ROC curves for the Decision Tree classifier, one curve per risk class,
# plus the diagonal random-guess baseline.
plt.figure(figsize=(7, 7))
for idx, class_name in enumerate(('Low', 'Medium', 'High')):
    plt.plot(dtc_fpr[idx], dtc_tpr[idx],
             label='Dec. Tree Class. Model (%s) - ROC Curve (area = %.2f)' % (class_name, dtc_auc[idx]))
plt.plot([0, 1], [0, 1], linestyle='--', color='r', label='Random guess')
plt.title('ROC curve (Decision Tree Classifier)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()
Except for Naive Bayes, the models achieved satisfactory accuracy. However, the test set was small (250 samples), so we cannot be confident these models will perform as well on new data. A larger dataset would be needed to train further and to check whether the accuracy holds or decreases.